##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
The data we use comes from a competitive Tetris ladder, https://ch.tetr.io/. The data was web-scraped via the API https://ch.tetr.io/api/users/by/league?limit=50&after=23731.327577078657%3A0%3A1e-10
After scraping we are now ready to look into the data:
# Type clean-up of the scraped table.

# Country: rewrite three "Name, Qualifier of" labels into natural word order.
# sub() returns a character vector, so Country ends up as character.
country_relabels <- c(
  "Korea, Republic of" = "Republic of Korea",
  "Venezuela, Bolivarian Republic of" = "Republic of Venezuela",
  "Macedonia, the former Yugoslav Republic of" = "Republic of Macedonia"
)
country <- as.factor(data$Country)
for (old_label in names(country_relabels)) {
  country <- sub(old_label, country_relabels[[old_label]], country)
}
data$Country <- country

# Rank: factor with the levels listed from lowest (D) to highest (X+).
rank_levels <- c(
  "D", "D+", "C-", "C", "C+", "B-", "B", "B+", "A-",
  "A", "A+", "S-", "S", "S+", "SS", "U", "X", "X+"
)
data$Rank <- factor(data$Rank, levels = rank_levels)

# "Yes"/other flags become 1/0 indicators.
for (flag_col in c("Active.This.Week", "Supporter.Status.")) {
  data[[flag_col]] <- ifelse(as.factor(data[[flag_col]]) == "Yes", 1, 0)
}

# Counts as numeric, usernames as plain strings.
for (count_col in c("Wins", "Games.Played")) {
  data[[count_col]] <- as.numeric(data[[count_col]])
}
data$Username <- as.character(data$Username)

# Drop the index column X (Standing already serves as the index).
data$X <- NULL
## Standing Username Country Wins
## Min. : 1 Length:39769 Length:39769 Min. : 0.0
## 1st Qu.: 9943 Class :character Class :character 1st Qu.: 31.0
## Median :19885 Mode :character Mode :character Median : 83.0
## Mean :19885 Mean : 158.7
## 3rd Qu.:29827 3rd Qu.: 202.0
## Max. :39769 Max. :4001.0
##
## Games.Played Winrate APM PPS
## Min. : 10.0 Min. :0.0000 Min. : 1.05 Min. :0.300
## 1st Qu.: 63.0 1st Qu.:0.4844 1st Qu.: 15.11 1st Qu.:0.940
## Median : 159.0 Median :0.5087 Median : 23.10 Median :1.170
## Mean : 311.2 Mean :0.4951 Mean : 30.56 Mean :1.259
## 3rd Qu.: 389.0 3rd Qu.:0.5327 3rd Qu.: 38.13 3rd Qu.:1.480
## Max. :8142.0 Max. :1.0000 Max. :227.68 Max. :4.270
##
## VS Glicko.Rating Rating.Deviation Tetra.Rating
## Min. : 1.75 Min. : 265 Min. : 60.00 Min. : 11.47
## 1st Qu.: 32.44 1st Qu.:1168 1st Qu.: 62.00 1st Qu.: 4531.12
## Median : 49.86 Median :1479 Median : 72.00 Median : 9509.57
## Mean : 64.76 Mean :1496 Mean : 74.94 Mean : 9862.82
## 3rd Qu.: 81.44 3rd Qu.:1774 3rd Qu.: 86.00 3rd Qu.:14693.09
## Max. :438.21 Max. :4276 Max. :100.00 Max. :24752.28
##
## Rank Active.This.Week Supporter.Status. RankColour
## B : 3182 Min. :0.0000 Min. :0.00000 Length:39769
## A- : 3182 1st Qu.:0.0000 1st Qu.:0.00000 Class :character
## A+ : 3182 Median :1.0000 Median :0.00000 Mode :character
## B- : 3181 Mean :0.5722 Mean :0.02721
## B+ : 3181 3rd Qu.:1.0000 3rd Qu.:0.00000
## A : 3181 Max. :1.0000 Max. :1.00000
## (Other):20680
## [1] "Standing" "Username" "Country"
## [4] "Wins" "Games.Played" "Winrate"
## [7] "APM" "PPS" "VS"
## [10] "Glicko.Rating" "Rating.Deviation" "Tetra.Rating"
## [13] "Rank" "Active.This.Week" "Supporter.Status."
## [16] "RankColour"
## 'data.frame': 39769 obs. of 16 variables:
## $ Standing : int 1 2 3 4 5 6 7 8 9 10 ...
## $ Username : chr "5HAN" "CABOOZLED_PIE" "TURTLE" "SYAKEGOHAN" ...
## $ Country : chr "Japan" "United States" "Republic of Korea" "Japan" ...
## $ Wins : num 1026 347 511 358 320 ...
## $ Games.Played : num 1233 394 670 437 454 ...
## $ Winrate : num 0.832 0.881 0.763 0.819 0.705 ...
## $ APM : num 228 213 195 203 191 ...
## $ PPS : num 4.27 3.92 3.39 3.83 3.42 3.84 3.44 3.56 3.04 3.84 ...
## $ VS : num 438 421 389 394 391 ...
## $ Glicko.Rating : int 4276 4026 3963 3944 3931 3899 3862 3854 3853 3767 ...
## $ Rating.Deviation : int 85 71 72 76 72 81 67 65 86 68 ...
## $ Tetra.Rating : num 24752 24641 24601 24591 24579 ...
## $ Rank : Factor w/ 18 levels "D","D+","C-",..: 18 18 18 18 18 18 18 18 18 18 ...
## $ Active.This.Week : num 1 1 1 0 1 1 1 1 1 1 ...
## $ Supporter.Status.: num 1 1 0 1 1 1 0 1 1 0 ...
## $ RankColour : chr "#A763EA" "#A763EA" "#A763EA" "#A763EA" ...
## Standing Username Country Wins
## "integer" "character" "character" "numeric"
## Games.Played Winrate APM PPS
## "numeric" "numeric" "numeric" "numeric"
## VS Glicko.Rating Rating.Deviation Tetra.Rating
## "numeric" "integer" "integer" "numeric"
## Rank Active.This.Week Supporter.Status. RankColour
## "factor" "numeric" "numeric" "character"
# Numeric variables: logical mask over the columns of `data`.
# vapply() always yields a logical vector (sapply() returns a list for a
# zero-column input), and `num_vars` is reused further down the script.
num_vars <- vapply(data, is.numeric, logical(1))
# Min-max, quartiles and mean per numeric column; drop = FALSE keeps a data
# frame (one summary column per variable) even if only one column is numeric.
summary(data[, num_vars, drop = FALSE])
## Standing Wins Games.Played Winrate
## Min. : 1 Min. : 0.0 Min. : 10.0 Min. :0.0000
## 1st Qu.: 9943 1st Qu.: 31.0 1st Qu.: 63.0 1st Qu.:0.4844
## Median :19885 Median : 83.0 Median : 159.0 Median :0.5087
## Mean :19885 Mean : 158.7 Mean : 311.2 Mean :0.4951
## 3rd Qu.:29827 3rd Qu.: 202.0 3rd Qu.: 389.0 3rd Qu.:0.5327
## Max. :39769 Max. :4001.0 Max. :8142.0 Max. :1.0000
## APM PPS VS Glicko.Rating
## Min. : 1.05 Min. :0.300 Min. : 1.75 Min. : 265
## 1st Qu.: 15.11 1st Qu.:0.940 1st Qu.: 32.44 1st Qu.:1168
## Median : 23.10 Median :1.170 Median : 49.86 Median :1479
## Mean : 30.56 Mean :1.259 Mean : 64.76 Mean :1496
## 3rd Qu.: 38.13 3rd Qu.:1.480 3rd Qu.: 81.44 3rd Qu.:1774
## Max. :227.68 Max. :4.270 Max. :438.21 Max. :4276
## Rating.Deviation Tetra.Rating Active.This.Week Supporter.Status.
## Min. : 60.00 Min. : 11.47 Min. :0.0000 Min. :0.00000
## 1st Qu.: 62.00 1st Qu.: 4531.12 1st Qu.:0.0000 1st Qu.:0.00000
## Median : 72.00 Median : 9509.57 Median :1.0000 Median :0.00000
## Mean : 74.94 Mean : 9862.82 Mean :0.5722 Mean :0.02721
## 3rd Qu.: 86.00 3rd Qu.:14693.09 3rd Qu.:1.0000 3rd Qu.:0.00000
## Max. :100.00 Max. :24752.28 Max. :1.0000 Max. :1.00000
# Drop usernames so the frequency tables below stay a readable length.
data_no_username <- data %>% select(-Username)

# Categorical variables: columns stored as factor or character.
# vapply() guarantees a logical mask regardless of the input.
cat_vars <- vapply(
  data_no_username,
  function(column) is.factor(column) || is.character(column),
  logical(1)
)

# One frequency table per categorical column. drop = FALSE keeps the subset a
# data frame, so lapply() iterates over columns even when only one matches.
lapply(data_no_username[, cat_vars, drop = FALSE], table)
## $Country
##
##
## 447
## Afghanistan
## 2
## Åland Islands
## 1
## Albania
## 3
## Algeria
## 6
## American Samoa
## 1
## Andorra
## 7
## Angola
## 2
## Anguilla
## 2
## Antarctica
## 87
## Antigua and Barbuda
## 3
## Argentina
## 497
## Armenia
## 5
## Australia
## 1333
## Austria
## 45
## Azerbaijan
## 10
## Bahamas
## 3
## Bahrain
## 10
## Bangladesh
## 14
## Barbados
## 4
## Belarus
## 26
## Belgium
## 201
## Belize
## 4
## Benin
## 1
## Bermuda
## 2
## Bhutan
## 1
## Bolivia, Plurinational State of
## 69
## Bosnia and Herzegovina
## 4
## Botswana
## 1
## Bouvet Island
## 2
## Brazil
## 1160
## British Indian Ocean Territory
## 1
## Brunei Darussalam
## 44
## Bulgaria
## 37
## Burkina Faso
## 1
## Burundi
## 1
## Cambodia
## 38
## Canada
## 1743
## Cayman Islands
## 1
## Central African Republic
## 1
## Chad
## 2
## Chile
## 399
## China
## 539
## Christmas Island
## 5
## Cocos (Keeling) Islands
## 4
## Colombia
## 240
## Congo
## 1
## Cook Islands
## 6
## Costa Rica
## 37
## Côte d'Ivoire
## 3
## Croatia
## 26
## Cuba
## 3
## Curaçao
## 1
## Cyprus
## 4
## Czech Republic
## 74
## Denmark
## 113
## Djibouti
## 1
## Dominican Republic
## 25
## Ecuador
## 59
## Egypt
## 22
## El Salvador
## 13
## England
## 6
## Equatorial Guinea
## 1
## Eritrea
## 1
## Estonia
## 27
## Europe
## 8
## Faroe Islands
## 2
## Fiji
## 2
## Finland
## 85
## France
## 414
## French Guiana
## 3
## French Southern Territories
## 2
## Gabon
## 1
## Gambia
## 1
## Georgia
## 17
## Germany
## 413
## Ghana
## 1
## Gibraltar
## 2
## Greece
## 47
## Greenland
## 2
## Grenada
## 1
## Guam
## 4
## Guatemala
## 28
## Guernsey
## 1
## Guinea-Bissau
## 1
## Guyana
## 1
## Haiti
## 1
## Heard Island and McDonald Islands
## 2
## Holy See (Vatican City State)
## 11
## Honduras
## 8
## Hong Kong
## 628
## Hungary
## 41
## Iceland
## 8
## India
## 106
## Indonesia
## 881
## Iran, Islamic Republic of
## 9
## Iraq
## 5
## Ireland
## 38
## Isle of Man
## 3
## Israel
## 74
## Italy
## 212
## Jamaica
## 4
## Japan
## 1689
## Jordan
## 13
## Kazakhstan
## 79
## Kenya
## 8
## Korea, Democratic People's Republic of
## 18
## Kosovo
## 1
## Kuwait
## 1
## Kyrgyzstan
## 6
## Lao People's Democratic Republic
## 9
## Latvia
## 43
## Lebanon
## 8
## Lesotho
## 2
## Libya
## 2
## Lithuania
## 37
## Luxembourg
## 2
## Macao
## 17
## Madagascar
## 1
## Malawi
## 1
## Malaysia
## 1565
## Maldives
## 3
## Malta
## 6
## Marshall Islands
## 2
## Martinique
## 3
## Mauritius
## 3
## Mayotte
## 3
## Mexico
## 481
## Micronesia, Federated States of
## 1
## Moldova, Republic of
## 6
## Monaco
## 4
## Mongolia
## 1154
## Montenegro
## 1
## Morocco
## 20
## Myanmar
## 4
## Namibia
## 1
## Nauru
## 2
## Nepal
## 8
## Netherlands
## 190
## New Caledonia
## 2
## New Zealand
## 244
## Nicaragua
## 2
## Niger
## 1
## Nigeria
## 3
## Niue
## 3
## Norfolk Island
## 1
## Northern Ireland
## 2
## Northern Mariana Islands
## 1
## Norway
## 86
## Oman
## 3
## Pakistan
## 18
## Palau
## 1
## Palestine
## 3
## Panama
## 26
## Paraguay
## 9
## Peru
## 381
## Philippines
## 3898
## Poland
## 227
## Portugal
## 141
## Puerto Rico
## 8
## Qatar
## 13
## Republic of Korea
## 5287
## Republic of Macedonia
## 9
## Republic of Venezuela
## 48
## Réunion
## 5
## Romania
## 59
## Russian Federation
## 534
## Saint Barthélemy
## 1
## Saint Helena, Ascension and Tristan da Cunha
## 2
## Saint Lucia
## 2
## Saint Martin
## 1
## Saint Pierre and Miquelon
## 2
## Samoa
## 1
## San Marino
## 2
## Saudi Arabia
## 29
## Scotland
## 8
## Senegal
## 4
## Serbia
## 32
## Seychelles
## 1
## Sierra Leone
## 1
## Singapore
## 855
## Slovakia
## 28
## Slovenia
## 18
## Somalia
## 3
## South Africa
## 34
## South Georgia and the South Sandwich Islands
## 1
## Spain
## 398
## Sri Lanka
## 2
## Sudan
## 1
## Suriname
## 1
## Svalbard and Jan Mayen Islands
## 2
## Swaziland
## 1
## Sweden
## 126
## Switzerland
## 53
## Syrian Arab Republic
## 4
## Taiwan
## 2877
## Tajikistan
## 1
## Tanzania, United Republic of
## 2
## Thailand
## 184
## Togo
## 2
## Tokelau
## 1
## Tonga
## 1
## Trinidad and Tobago
## 5
## Tunisia
## 11
## Turkey
## 244
## Turkmenistan
## 1
## Turks and Caicos Islands
## 2
## Tuvalu
## 3
## Ukraine
## 79
## United Arab Emirates
## 56
## United Kingdom
## 598
## United States
## 6281
## Uruguay
## 34
## US Minor Outlying Islands
## 8
## Uzbekistan
## 2
## Vanuatu
## 1
## Vietnam
## 833
## Virgin Islands, British
## 2
## Virgin Islands, U.S.
## 7
## Wales
## 2
## Wallis and Futuna Islands
## 2
## Western Sahara
## 2
## Yemen
## 3
## Zambia
## 1
## Zimbabwe
## 3
##
## $Rank
##
## D D+ C- C C+ B- B B+ A- A A+ S- S S+ SS U
## 995 994 1988 2387 2386 3181 3182 3181 3182 3181 3182 2784 2386 2386 2386 1591
## X X+
## 318 79
##
## $RankColour
##
## #1FA834 #3BB687 #46AD51 #4F64C9 #4F99C0 #552883 #5650C7 #6C496E #733E8F #79558C
## 3182 3182 3181 3182 3181 2386 3181 994 2387 1988
## #907591 #A763EA #B2972B #D8AF0E #DB8B1F #E0A71B #FF3813 #FF45FF
## 995 79 2784 2386 2386 2386 1591 318
## Rows: 39,769
## Columns: 16
## $ Standing <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 1…
## $ Username <chr> "5HAN", "CABOOZLED_PIE", "TURTLE", "SYAKEGOHAN", "VI…
## $ Country <chr> "Japan", "United States", "Republic of Korea", "Japa…
## $ Wins <dbl> 1026, 347, 511, 358, 320, 270, 179, 323, 164, 356, 9…
## $ Games.Played <dbl> 1233, 394, 670, 437, 454, 358, 229, 463, 206, 533, 1…
## $ Winrate <dbl> 0.8321, 0.8807, 0.7627, 0.8192, 0.7048, 0.7542, 0.78…
## $ APM <dbl> 227.68, 213.34, 194.60, 202.60, 190.93, 203.54, 194.…
## $ PPS <dbl> 4.27, 3.92, 3.39, 3.83, 3.42, 3.84, 3.44, 3.56, 3.04…
## $ VS <dbl> 438.21, 420.66, 388.93, 393.83, 391.01, 400.73, 388.…
## $ Glicko.Rating <int> 4276, 4026, 3963, 3944, 3931, 3899, 3862, 3854, 3853…
## $ Rating.Deviation <int> 85, 71, 72, 76, 72, 81, 67, 65, 86, 68, 65, 64, 75, …
## $ Tetra.Rating <dbl> 24752.28, 24640.67, 24601.05, 24591.11, 24579.48, 24…
## $ Rank <fct> X+, X+, X+, X+, X+, X+, X+, X+, X+, X+, X+, X+, X+, …
## $ Active.This.Week <dbl> 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1…
## $ Supporter.Status. <dbl> 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1…
## $ RankColour <chr> "#A763EA", "#A763EA", "#A763EA", "#A763EA", "#A763EA…
| Name | data |
| Number of rows | 39769 |
| Number of columns | 16 |
| _______________________ | |
| Column type frequency: | |
| character | 3 |
| factor | 1 |
| numeric | 12 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| Username | 0 | 1 | 1 | 16 | 0 | 39764 | 0 |
| Country | 0 | 1 | 0 | 44 | 447 | 225 | 0 |
| RankColour | 0 | 1 | 7 | 7 | 0 | 18 | 0 |
Variable type: factor
| skim_variable | n_missing | complete_rate | ordered | n_unique | top_counts |
|---|---|---|---|---|---|
| Rank | 0 | 1 | FALSE | 18 | B: 3182, A-: 3182, A+: 3182, B-: 3181 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| Standing | 0 | 1 | 19885.00 | 11480.47 | 1.00 | 9943.00 | 19885.00 | 29827.00 | 39769.00 | ▇▇▇▇▇ |
| Wins | 0 | 1 | 158.71 | 212.86 | 0.00 | 31.00 | 83.00 | 202.00 | 4001.00 | ▇▁▁▁▁ |
| Games.Played | 0 | 1 | 311.19 | 422.82 | 10.00 | 63.00 | 159.00 | 389.00 | 8142.00 | ▇▁▁▁▁ |
| Winrate | 0 | 1 | 0.50 | 0.09 | 0.00 | 0.48 | 0.51 | 0.53 | 1.00 | ▁▁▇▁▁ |
| APM | 0 | 1 | 30.56 | 23.13 | 1.05 | 15.11 | 23.10 | 38.13 | 227.68 | ▇▂▁▁▁ |
| PPS | 0 | 1 | 1.26 | 0.45 | 0.30 | 0.94 | 1.17 | 1.48 | 4.27 | ▇▇▂▁▁ |
| VS | 0 | 1 | 64.76 | 47.78 | 1.75 | 32.44 | 49.86 | 81.44 | 438.21 | ▇▂▁▁▁ |
| Glicko.Rating | 0 | 1 | 1496.23 | 448.56 | 265.00 | 1168.00 | 1479.00 | 1774.00 | 4276.00 | ▂▇▂▁▁ |
| Rating.Deviation | 0 | 1 | 74.94 | 12.80 | 60.00 | 62.00 | 72.00 | 86.00 | 100.00 | ▇▃▂▂▂ |
| Tetra.Rating | 0 | 1 | 9862.82 | 6100.94 | 11.47 | 4531.12 | 9509.57 | 14693.09 | 24752.28 | ▇▇▇▅▂ |
| Active.This.Week | 0 | 1 | 0.57 | 0.49 | 0.00 | 0.00 | 1.00 | 1.00 | 1.00 | ▆▁▁▁▇ |
| Supporter.Status. | 0 | 1 | 0.03 | 0.16 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
Missing-value scan
## Standing Username Country Wins
## 0 0 0 0
## Games.Played Winrate APM PPS
## 0 0 0 0
## VS Glicko.Rating Rating.Deviation Tetra.Rating
## 0 0 0 0
## Rank Active.This.Week Supporter.Status. RankColour
## 0 0 0 0
## Standing Username Country Wins
## 0 0 0 0
## Games.Played Winrate APM PPS
## 0 0 0 0
## VS Glicko.Rating Rating.Deviation Tetra.Rating
## 0 0 0 0
## Rank Active.This.Week Supporter.Status. RankColour
## 0 0 0 0
#duplicates = list()
#k = 1
#
#for (i in 1:(nrow(data) - 1)) {
# for (j in (i + 1):nrow(data)) {
# if (data$Username[i] == data$Username[j]) {
# duplicates[[k]] = c(i, j)
# k = k + 1
# }
# }
#}## [1] 0
## [1] 5
#which(duplicated(data$Username))
#data$Username[c(11030, 11178, 11429, 12178, 25056)]
#data[c(11030, 11178, 11429, 12178, 25056), ]## Standing Username Country Wins
## Min. : 1 Length:39769 Length:39769 Min. : 0.0
## 1st Qu.: 9943 Class :character Class :character 1st Qu.: 31.0
## Median :19885 Mode :character Mode :character Median : 83.0
## Mean :19885 Mean : 158.7
## 3rd Qu.:29827 3rd Qu.: 202.0
## Max. :39769 Max. :4001.0
##
## Games.Played Winrate APM PPS
## Min. : 10.0 Min. :0.0000 Min. : 1.05 Min. :0.300
## 1st Qu.: 63.0 1st Qu.:0.4844 1st Qu.: 15.11 1st Qu.:0.940
## Median : 159.0 Median :0.5087 Median : 23.10 Median :1.170
## Mean : 311.2 Mean :0.4951 Mean : 30.56 Mean :1.259
## 3rd Qu.: 389.0 3rd Qu.:0.5327 3rd Qu.: 38.13 3rd Qu.:1.480
## Max. :8142.0 Max. :1.0000 Max. :227.68 Max. :4.270
##
## VS Glicko.Rating Rating.Deviation Tetra.Rating
## Min. : 1.75 Min. : 265 Min. : 60.00 Min. : 11.47
## 1st Qu.: 32.44 1st Qu.:1168 1st Qu.: 62.00 1st Qu.: 4531.12
## Median : 49.86 Median :1479 Median : 72.00 Median : 9509.57
## Mean : 64.76 Mean :1496 Mean : 74.94 Mean : 9862.82
## 3rd Qu.: 81.44 3rd Qu.:1774 3rd Qu.: 86.00 3rd Qu.:14693.09
## Max. :438.21 Max. :4276 Max. :100.00 Max. :24752.28
##
## Rank Active.This.Week Supporter.Status. RankColour
## B : 3182 Min. :0.0000 Min. :0.00000 Length:39769
## A- : 3182 1st Qu.:0.0000 1st Qu.:0.00000 Class :character
## A+ : 3182 Median :1.0000 Median :0.00000 Mode :character
## B- : 3181 Mean :0.5722 Mean :0.02721
## B+ : 3181 3rd Qu.:1.0000 3rd Qu.:0.00000
## A : 3181 Max. :1.0000 Max. :1.00000
## (Other):20680
#d1 = data[c(which(duplicated(data$Username))), 1]
#d2 = data[c(which(duplicated(data$Username, fromLast=TRUE))), 1]
#
#for (i in d1) for (j in d2) {
# if (data$Games.Played[i] > data$Games.Played[j])
# data = data[-j,]
# else
# data = data[-i,]
#}
# Resolve duplicated usernames: for every username that occurs more than once,
# keep the row with the most games played and drop the others.
#
# Changes relative to the previous nested loop:
#  * rows are addressed by their actual position, not by the value of the
#    Standing column (which only coincides with the row number while the
#    table is complete and sorted);
#  * ties in Games.Played keep the first occurrence instead of leaving both
#    rows in place;
#  * the removal is skipped when nothing has to go, because
#    data[-integer(0), ] would select zero rows.
dup_names <- unique(data$Username[duplicated(data$Username)])

to_remove <- unlist(lapply(dup_names, function(name) {
  rows <- which(data$Username == name)
  # which.max() returns the first maximum, so ties keep the earliest row.
  keep <- rows[which.max(data$Games.Played[rows])]
  setdiff(rows, keep)
}))

if (length(to_remove) > 0) {
  data <- data[-to_remove, ]
}
data## [1] 0
## [1] 0
# Names of the numeric columns, using the mask computed earlier in the script.
numeric_cols <- names(data)[num_vars]

# Histograms of the first six numeric variables on a 2 x 3 grid
# (only six so the panels stay legible).
par(mfrow = c(2, 3))
for (hist_var in numeric_cols[1:6]) {
  hist(data[[hist_var]], main = hist_var, xlab = "")
}

# Point colours come from each player's rank colour.
rank_colours <- data$RankColour

#plot(data$APM, data$Glicko.Rating, data$Winrate, data$PPS)
# Scatter-plot matrix of four performance measures.
pairs(data[, c("APM", "Glicko.Rating", "Winrate", "PPS")], col = rank_colours)

# Same idea with Games.Played added, drawn with small solid points.
plot_data <- data.frame(
  data$APM, data$Glicko.Rating, data$Games.Played, data$Winrate, data$PPS
)
plot(plot_data, cex = 0.3, pch = 19, col = rank_colours)

# Attacks per minute vs Glicko rating
plot(data$APM, data$Glicko.Rating, cex = 0.3, pch = 19, col = rank_colours)

# Games played vs win rate
plot(data$Games.Played, data$Winrate, cex = 0.3, pch = 19, col = rank_colours)

# All numeric variables in one boxplot panel: vertical axis labels (las = 2),
# shrunk so they fit.
boxplot(data[, numeric_cols], las = 2, cex.axis = 0.7)

library(tidyr)
library(ggplot2)

# Long format: one row per (player, numeric variable) pair.
data_long <- pivot_longer(
  data,
  cols = all_of(numeric_cols),
  names_to = "Variable",
  values_to = "Value"
)

# One boxplot facet per variable, each with its own y scale; the x axis text
# is blanked because the facet strip already names the variable.
box_facets <- ggplot(data_long, aes(x = Variable, y = Value)) +
  geom_boxplot(outlier.colour = "firebrick") +
  facet_wrap(~ Variable, scales = "free_y", ncol = 4)
box_facets +
  theme_bw(base_size = 10) +
  theme(
    axis.text.x = element_blank(),
    axis.ticks.x = element_blank()
  )
## [1] 21656
## [1] 21656
## [1] 1 2 4 6
## [1] 1 2
# Recompute the numeric column names from the current state of `data`.
numeric_cols <- names(data)[vapply(data, is.numeric, logical(1))]
#numeric_cols

# Split the numeric variables into three groups by magnitude so that each
# boxplot panel has a usable axis range.
big_vars <- c("Standing", "Tetra.Rating")
mid_vars <- c("Games.Played", "Wins", "Glicko.Rating")
small_vars <- setdiff(numeric_cols, c(big_vars, mid_vars))

# Every grouped name must be an existing numeric column.
stopifnot(
  all(big_vars %in% numeric_cols),
  all(mid_vars %in% numeric_cols),
  all(small_vars %in% numeric_cols)
)

# One panel per scale group, side by side.
par(mfrow = c(1, 3))
scale_groups <- list(
  "Big scale" = big_vars,
  "Mid scale" = mid_vars,
  "Small scale" = small_vars
)
for (panel_title in names(scale_groups)) {
  boxplot(data[, scale_groups[[panel_title]]], main = panel_title, las = 2)
}

## 1. Pick out numeric columns --------------------------------------------
numeric_cols <- names(data)[vapply(data, is.numeric, logical(1))]

## 2. Choose a grid that fits them all (3 columns is usually nice) ---------
ncols <- 3
nplots <- length(numeric_cols)
nrows <- ceiling(nplots / ncols)
par(
  mfrow = c(nrows, ncols), # grid of plots
  mar = c(4, 4, 2, 0.5)    # slightly tighter margins
)

## 3. Loop over the variables ---------------------------------------------
for (box_var in numeric_cols) {
  # Horizontal box, titled with the variable name, no y-axis label.
  boxplot(data[[box_var]], main = box_var, ylab = "", horizontal = TRUE)
}
#skewness function
# Pearson's second (median) skewness coefficient: 3 * (mean - median) / sd.
#
# Arguments:
#   variable  numeric vector of observations.
#   na.rm     if TRUE, missing values are removed before the mean, median and
#             standard deviation are computed. The default FALSE keeps the
#             previous behaviour: any NA in the input gives NA.
#
# Returns a single number. Positive values mean the mean lies above the
# median, negative values the opposite. The result is NaN when the standard
# deviation is zero, and NA when fewer than two values are available.
skewness <- function(variable, na.rm = FALSE) {
  centre_gap <- mean(variable, na.rm = na.rm) - median(variable, na.rm = na.rm)
  (3 * centre_gap) / sd(variable, na.rm = na.rm)
}
# Distribution of the Glicko rating.
hist(data$Glicko.Rating)
## [1] 0.1151184
#data_numerical = data[,c(1,4,5,6,7,8,9,10,11,12,15,17)]
#str(data)

# Variables used for the normality checks, selected by name rather than by
# column position so the selection survives columns being added or moved.
# NOTE(review): Active.This.Week is numeric but is not part of this selection
# (it was not in the original index list either) - confirm this is intended.
normality_vars <- c(
  "Standing", "Wins", "Games.Played", "Winrate", "APM", "PPS", "VS",
  "Glicko.Rating", "Rating.Deviation", "Tetra.Rating", "Supporter.Status."
)
data_numerical <- data[, normality_vars]

# Univariate check: one normal Q-Q plot per variable with a reference line.
for (v in names(data_numerical)) {
  qqnorm(data_numerical[[v]],
         main = v,
         ylab = "Observed Quantiles",
         xlab = "Theoretical Quantiles")
  qqline(data_numerical[[v]], col = "red", lwd = 2)
}

# Multivariate check. Work on complete cases throughout, so that n, p, the
# covariance matrix and the distances all refer to the same rows (previously
# n and p came from the complete cases while cov()/mahalanobis() used the
# unfiltered table).
data_num_complete <- na.omit(data_numerical)
n <- nrow(data_num_complete)
p <- ncol(data_num_complete)
Sx <- cov(data_num_complete)
D2 <- mahalanobis(data_num_complete, colMeans(data_num_complete), Sx)

# Theoretical chi-square quantiles with p degrees of freedom.
chi_q <- qchisq(ppoints(n, a = 0.5), df = p)

# Chi-square Q-Q plot (multivariate normality): squared Mahalanobis distances
# against the theoretical quantiles, with the y = x reference line.
qqplot(chi_q, D2,
       ylab = "Mahalanobis Distance",
       xlab = bquote("quantiles of " ~ chi[.(p)]^2))
abline(0, 1, col = "red", lwd = 2)
title(main = "Mahalanobis D²", font.main = 2)
A comparison of the squared Mahalanobis distances $D^2$ with the $\chi^2_{11}$ reference
line shows a pronounced upward curvature and many large outliers,
indicating that the joint distribution of the 11 selected numeric variables
deviates substantially from multivariate normality.
#APM vs PPS
# Scatter plot of attack per minute (y) against pieces per second (x), with
# each point drawn in the player's rank colour.
plot(data$PPS, data$APM, cex=0.5, pch=19, col=data$RankColour,
xlab="Pieces Per Second", ylab="Attack Per Minute",
main="APM vs PPS, Coloured by Rank")
# Simple linear regression of APM on PPS. The formula refers to the columns
# as data$..., which is how the terms are labelled in the printed summary.
linearFit = lm(data$APM ~ data$PPS)
# Overlay the fitted regression line on the scatter plot.
abline(linearFit, lwd=2)##
## Call:
## lm(formula = data$APM ~ data$PPS)
##
## Residuals:
## Min 1Q Median 3Q Max
## -90.474 -5.498 0.131 5.479 75.780
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -28.8916 0.1417 -203.9 <0.0000000000000002 ***
## data$PPS 47.2307 0.1061 445.2 <0.0000000000000002 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 9.454 on 39762 degrees of freedom
## Multiple R-squared: 0.8329, Adjusted R-squared: 0.8329
## F-statistic: 1.982e+05 on 1 and 39762 DF, p-value: < 0.00000000000000022
# Supporter status by rank.
# Cross-tabulate supporter flag (rows: 0, 1) against rank (columns), then
# convert each rank column to within-rank proportions.
table <- table(data$Supporter.Status., data$Rank)
proportions <- prop.table(table, margin = 2)

# Bar fill per supporter flag, in row order: 0 = not supporter, 1 = supporter.
status_colours <- c("#4d4d4d", "orange")

#par(mfrow=c(2,1))
# Stacked counts per rank.
barplot(
  table,
  xlab = "Rank",
  ylab = "Frequency",
  col = status_colours,
  main = "Frequency of Supporters & Non-Supporters by Rank"
)
#box()
# Legend lists supporters first, so the colours are given in reverse order.
legend(
  "topright",
  legend = c("Supporter", "Not Supporter"),
  col = rev(status_colours),
  pch = 15
)

# Stacked within-rank proportions.
barplot(
  proportions,
  xlab = "Rank",
  ylab = "Proportion",
  col = status_colours,
  main = "Proportion of Supporters & Non-Supporters by Rank"
)
### Correlation
#help(cor)
#data$Country = as.factor(data$Country)
# Numeric encoding of the rank factor (integer level codes), so that rank can
# take part in the correlation matrix.
data$Rank_number <- as.numeric(data$Rank)
#data$Rank_number
#str(data)

# Pairwise correlations between the numeric variables, including the two 0/1
# indicators and the numeric rank.
correlation_vars <- c(
  "Standing", "Wins", "Games.Played", "Winrate", "APM", "PPS", "VS",
  "Glicko.Rating", "Rating.Deviation", "Tetra.Rating",
  "Active.This.Week", "Supporter.Status.", "Rank_number"
)
data_correlation <- cor(data[, correlation_vars])
data_correlation
## Standing Wins Games.Played Winrate APM
## Standing 1.0000000 -0.4408111 -0.41821733 -0.60516544 -0.8390488
## Wins -0.4408111 1.0000000 0.99892120 0.15088807 0.3844931
## Games.Played -0.4182173 0.9989212 1.00000000 0.12782667 0.3583682
## Winrate -0.6051654 0.1508881 0.12782667 1.00000000 0.4477224
## APM -0.8390488 0.3844931 0.35836818 0.44772242 1.0000000
## PPS -0.8448133 0.4024040 0.38002884 0.46739914 0.9126476
## VS -0.8582807 0.3914379 0.36553775 0.45494757 0.9949365
## Glicko.Rating -0.9602501 0.4219243 0.39557370 0.58659481 0.9241072
## Rating.Deviation 0.2637014 -0.3751242 -0.37069935 -0.19753213 -0.1982860
## Tetra.Rating -0.9955876 0.4431432 0.41968178 0.58239296 0.8820971
## Active.This.Week -0.1439440 0.2090427 0.20694240 0.03169982 0.1364815
## Supporter.Status. -0.1665577 0.1059904 0.09762496 0.09728786 0.2360381
## Rank_number -0.9944410 0.4403577 0.41705449 0.63463993 0.8527989
## PPS VS Glicko.Rating Rating.Deviation
## Standing -0.8448133 -0.8582807 -0.9602501 0.26370138
## Wins 0.4024040 0.3914379 0.4219243 -0.37512420
## Games.Played 0.3800288 0.3655377 0.3955737 -0.37069935
## Winrate 0.4673991 0.4549476 0.5865948 -0.19753213
## APM 0.9126476 0.9949365 0.9241072 -0.19828605
## PPS 1.0000000 0.9147989 0.8973381 -0.21256242
## VS 0.9147989 1.0000000 0.9390499 -0.20462546
## Glicko.Rating 0.8973381 0.9390499 1.0000000 -0.22153297
## Rating.Deviation -0.2125624 -0.2046255 -0.2215330 1.00000000
## Tetra.Rating 0.8731348 0.8999056 0.9749465 -0.25957441
## Active.This.Week 0.1421478 0.1409972 0.1455130 -0.50978822
## Supporter.Status. 0.2103876 0.2324646 0.2037424 -0.05309128
## Rank_number 0.8541937 0.8712542 0.9680742 -0.26893159
## Tetra.Rating Active.This.Week Supporter.Status. Rank_number
## Standing -0.9955876 -0.14394401 -0.16655774 -0.9944410
## Wins 0.4431432 0.20904274 0.10599042 0.4403577
## Games.Played 0.4196818 0.20694240 0.09762496 0.4170545
## Winrate 0.5823930 0.03169982 0.09728786 0.6346399
## APM 0.8820971 0.13648146 0.23603810 0.8527989
## PPS 0.8731348 0.14214783 0.21038758 0.8541937
## VS 0.8999056 0.14099717 0.23246457 0.8712542
## Glicko.Rating 0.9749465 0.14551299 0.20374241 0.9680742
## Rating.Deviation -0.2595744 -0.50978822 -0.05309128 -0.2689316
## Tetra.Rating 1.0000000 0.14632194 0.18145251 0.9925282
## Active.This.Week 0.1463219 1.00000000 0.04277167 0.1428549
## Supporter.Status. 0.1814525 0.04277167 1.00000000 0.1727455
## Rank_number 0.9925282 0.14285490 0.17274551 1.0000000
Highly correlated variables:

* Standing or Tetra.Rating
* Games played or Wins
* APM, PPS, VS, Glicko Rating
* Rank_number
# The predictors Adam thinks should be used.
# Multiple linear regression of Glicko rating on games played, APM, PPS, VS,
# rating deviation and the two 0/1 indicators (active this week, supporter).
linearFit2 = lm(data$Glicko.Rating ~ data$Games.Played + data$APM + data$PPS + data$VS + data$Rating.Deviation + data$Active.This.Week + data$Supporter.Status.)
# Print coefficients, residual summary and fit statistics.
summary(linearFit2)##
## Call:
## lm(formula = data$Glicko.Rating ~ data$Games.Played + data$APM +
## data$PPS + data$VS + data$Rating.Deviation + data$Active.This.Week +
## data$Supporter.Status.)
##
## Residuals:
## Min 1Q Median 3Q Max
## -805.82 -90.05 23.08 103.27 403.73
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 764.606064 6.505819 117.526 < 0.0000000000000002 ***
## data$Games.Played 0.041823 0.001887 22.166 < 0.0000000000000002 ***
## data$APM -20.144351 0.301810 -66.745 < 0.0000000000000002 ***
## data$PPS 239.251542 3.902719 61.304 < 0.0000000000000002 ***
## data$VS 16.347619 0.147828 110.585 < 0.0000000000000002 ***
## data$Rating.Deviation -0.301614 0.066845 -4.512 0.00000643577 ***
## data$Active.This.Week -3.830898 1.640073 -2.336 0.0195 *
## data$Supporter.Status. -27.927844 4.412990 -6.329 0.00000000025 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 139 on 39756 degrees of freedom
## Multiple R-squared: 0.9039, Adjusted R-squared: 0.9039
## F-statistic: 5.344e+04 on 7 and 39756 DF, p-value: < 0.00000000000000022
# The predictors we keep based on the correlation matrix.
# Same response as linearFit2, with APM and PPS left out and VS retained.
linearFit3 = lm(data$Glicko.Rating ~ data$Games.Played + data$VS + data$Rating.Deviation + data$Active.This.Week + data$Supporter.Status.)
# Print coefficients, residual summary and fit statistics.
summary(linearFit3)##
## Call:
## lm(formula = data$Glicko.Rating ~ data$Games.Played + data$VS +
## data$Rating.Deviation + data$Active.This.Week + data$Supporter.Status.)
##
## Residuals:
## Min 1Q Median 3Q Max
## -799.63 -97.96 31.41 115.37 363.26
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 959.824488 6.367757 150.732 < 0.0000000000000002 ***
## data$Games.Played 0.059467 0.002047 29.057 < 0.0000000000000002 ***
## data$VS 8.634738 0.017591 490.851 < 0.0000000000000002 ***
## data$Rating.Deviation -0.516538 0.072963 -7.079 0.00000000000147 ***
## data$Active.This.Week -2.339120 1.791228 -1.306 0.192
## data$Supporter.Status. -44.646927 4.813891 -9.275 < 0.0000000000000002 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 151.9 on 39758 degrees of freedom
## Multiple R-squared: 0.8854, Adjusted R-squared: 0.8854
## F-statistic: 6.142e+04 on 5 and 39758 DF, p-value: < 0.00000000000000022
# Here we try to predict (estimate) a player's number of games played.
# Earlier variant with a larger predictor set, kept for reference:
#linearFit4 = lm(data$Games.Played ~ data$Standing + data$Winrate + data$APM + data$PPS + data$VS + data$Glicko.Rating + data$Rating.Deviation + data$Tetra.Rating + data$Active.This.Week + data$Supporter.Status. + data$Rank_number)
# Current model: games played on APM, PPS, VS, Glicko rating, rating
# deviation and the two 0/1 indicators.
linearFit4 = lm(data$Games.Played ~ data$APM + data$PPS + data$VS + data$Glicko.Rating + data$Rating.Deviation + data$Active.This.Week + data$Supporter.Status.)
# Print coefficients, residual summary and fit statistics.
summary(linearFit4)##
## Call:
## lm(formula = data$Games.Played ~ data$APM + data$PPS + data$VS +
## data$Glicko.Rating + data$Rating.Deviation + data$Active.This.Week +
## data$Supporter.Status.)
##
## Residuals:
## Min 1Q Median 3Q Max
## -900.5 -199.3 -67.5 92.2 7627.3
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 469.30997 19.81152 23.689 < 0.0000000000000002 ***
## data$APM -1.95136 0.84076 -2.321 0.020295 *
## data$PPS 154.28421 10.75888 14.340 < 0.0000000000000002 ***
## data$VS -0.29097 0.44658 -0.652 0.514697
## data$Glicko.Rating 0.29189 0.01317 22.166 < 0.0000000000000002 ***
## data$Rating.Deviation -9.56107 0.17000 -56.240 < 0.0000000000000002 ***
## data$Active.This.Week 8.17218 4.33291 1.886 0.059293 .
## data$Supporter.Status. 44.88804 11.66210 3.849 0.000119 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 367.3 on 39756 degrees of freedom
## Multiple R-squared: 0.2452, Adjusted R-squared: 0.2451
## F-statistic: 1845 on 7 and 39756 DF, p-value: < 0.00000000000000022
# Supporter status as the response.
# Ordinary least-squares fit of the 0/1 supporter indicator on APM, PPS, VS,
# Glicko rating, rating deviation and the active-this-week indicator.
# NOTE(review): the response is binary, so this is a linear probability model;
# consider whether a logistic regression (glm, family = binomial) is intended.
linearFit5 = lm(data$Supporter.Status. ~ data$APM + data$PPS + data$VS + data$Glicko.Rating + data$Rating.Deviation + data$Active.This.Week)
# Print coefficients, residual summary and fit statistics.
summary(linearFit5)##
## Call:
## lm(formula = data$Supporter.Status. ~ data$APM + data$PPS + data$VS +
## data$Glicko.Rating + data$Rating.Deviation + data$Active.This.Week)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.32036 -0.03240 -0.01116 -0.00247 1.00981
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.012688039 0.008519656 1.489 0.1364
## data$APM 0.002697576 0.000361315 7.466 0.0000000000000844 ***
## data$PPS 0.000116067 0.004626828 0.025 0.9800
## data$VS -0.000220984 0.000192048 -1.151 0.2499
## data$Glicko.Rating -0.000033626 0.000005661 -5.940 0.0000000028680277 ***
## data$Rating.Deviation -0.000071420 0.000073109 -0.977 0.3286
## data$Active.This.Week 0.003342003 0.001863281 1.794 0.0729 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.158 on 39757 degrees of freedom
## Multiple R-squared: 0.05734, Adjusted R-squared: 0.0572
## F-statistic: 403.1 on 6 and 39757 DF, p-value: < 0.00000000000000022
# Average APM, PPS and VS within each rank.
rank_mean <- function(stat) tapply(stat, data$Rank, mean)
avgAPMs <- rank_mean(data$APM)
avgPPSs <- rank_mean(data$PPS)
avgVSs <- rank_mean(data$VS)

# One bar chart per statistic, side by side; each panel is titled with the
# name of the vector it shows.
par(mfrow = c(1, 3))
rank_averages <- list(avgAPMs = avgAPMs, avgPPSs = avgPPSs, avgVSs = avgVSs)
for (stat_name in names(rank_averages)) {
  barplot(rank_averages[[stat_name]], main = paste(stat_name, "by Rank"))
}